#coding:utf8
import json
import os
import re

import pandas as pd

def clean_data(data_dir):
    """Extract comment records from a raw text dump and save them as CSV.

    The input file is scanned for every span that sits between a
    ``"comments":`` key and the next ``;fetchJSON`` marker.  Each span is
    expected to be a JSON list of comment objects followed by two closing
    characters (the ``})`` that ends the enclosing response); those two
    characters are stripped before the list is parsed.  A response that is
    not followed by another ``;fetchJSON`` marker is therefore not picked up.

    For every comment the ``creationTime``, ``nickname``, ``content`` and
    ``score`` fields are collected.  Newlines and spaces are removed from
    ``content``.  The result is written next to the input file, with the
    input's extension replaced by ``.csv``, indexed by ``creationTime``
    (parsed as datetimes) and encoded as ``utf_8_sig``.

    Args:
        data_dir: Path of the raw text file to read.  Despite the name this
            is a file path, not a directory.

    Raises:
        ValueError: If a captured span is not valid JSON
            (``json.JSONDecodeError`` is a subclass).
        KeyError: If a comment lacks one of the four expected fields.
    """
    # Read the raw dump; the context manager guarantees the handle is closed.
    with open(data_dir) as source:
        html = source.read()

    # Grab every comment list with a regular expression.  Each capture ends
    # with the two characters closing the surrounding response, so drop them.
    raw_pages = [
        captured[:-2]
        for captured in re.findall(r'"comments":(.+?);fetchJSON', html)
    ]

    creation_times = []
    nicknames = []
    contents = []
    scores = []

    for raw_page in raw_pages:
        # Iterate over however many comments the page really holds: pages may
        # be empty or shorter than a full page, so a fixed count would raise
        # IndexError on them.
        for comment in json.loads(raw_page):
            creation_times.append(comment['creationTime'])
            nicknames.append(comment['nickname'])
            contents.append(
                comment['content'].replace('\n', '').replace(' ', '')
            )
            scores.append(comment['score'])

    # Gather the extracted fields into one table for later analysis.
    table = pd.DataFrame({
        'creationTime': creation_times,
        'nickname': nicknames,
        'content': contents,
        'score': scores,
    })
    # Convert creationTime to datetimes and use it as the index column.
    table['creationTime'] = pd.to_datetime(table['creationTime'])
    table = table.set_index('creationTime')

    # Replace only the final extension so that dots elsewhere in the path
    # (e.g. "./data/x.txt" or "dump.v2/x.txt") do not truncate the output name.
    csv_data = os.path.splitext(data_dir)[0] + ".csv"

    table.to_csv(csv_data, encoding="utf_8_sig")

